Loading Packages

library(readr)
library(tibble)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ purrr     1.0.2
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# Load the replication dataset into a tibble.
# NOTE(review): the path is absolute and machine-specific, so it will only
# resolve on the original author's computer — consider a project-relative
# path once the intended project root is confirmed.
# read_csv auto-names the two header-less columns `...19` and `...23`.
data_replic <- read_csv("/Users/soniadalal/Desktop/302W/302W Project/data/data and replication code/data_replic.csv")
## New names:
## Rows: 1012 Columns: 27
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): iso3, region, income_group dbl (22): year, homrates_unodc, homrates_who,
## deport_convict, deport_convict... lgl (2): ...19, ...23
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...19`
## • `` -> `...23`
# Preview the first six rows to check that the columns were parsed as expected.
head(data_replic)
## # A tibble: 6 × 27
##   iso3   year region     income_group homrates_unodc homrates_who deport_convict
##   <chr> <dbl> <chr>      <chr>                 <dbl>        <dbl>          <dbl>
## 1 AFG    2010 South Asia Low income             3.81        NA            0.0194
## 2 AFG    2012 South Asia Low income             5.41        NA            0.0265
## 3 ALB    1996 Europe & … Upper middl…          NA            8.36         0.142 
## 4 ALB    1998 Europe & … Upper middl…          NA           31.7          0.159 
## 5 ALB    2000 Europe & … Upper middl…          NA           10.7          0.339 
## 6 ALB    2002 Europe & … Upper middl…          NA            7.13         0.393 
## # ℹ 20 more variables: deport_convict_d <dbl>, deport_convict_lead <dbl>,
## #   deport_nonconvict <dbl>, deport_nonconvict_d <dbl>, corruption <dbl>,
## #   crime_US_weight <dbl>, GDP_growth <dbl>,
## #   GDPpercapita_const2010USD_log <dbl>, gini_mkt <dbl>,
## #   instr_benef_medicaidpregn_lag1 <dbl>, instr_enforce_everify_lag1 <dbl>,
## #   ...19 <lgl>, polity2 <dbl>, pop_sh14 <dbl>, population_log <dbl>,
## #   ...23 <lgl>, remittances_GDP <dbl>, urban_interp <dbl>, …
# Per-column summary statistics, including the number of NA values per column.
summary(data_replic)
##      iso3                year         region          income_group      
##  Length:1012        Min.   :1996   Length:1012        Length:1012       
##  Class :character   1st Qu.:2002   Class :character   Class :character  
##  Mode  :character   Median :2006   Mode  :character   Mode  :character  
##                     Mean   :2006                                        
##                     3rd Qu.:2010                                        
##                     Max.   :2014                                        
##                                                                         
##  homrates_unodc    homrates_who    deport_convict      deport_convict_d    
##  Min.   : 0.280   Min.   : 0.000   Min.   :  0.00000   Min.   :-164.00466  
##  1st Qu.: 1.394   1st Qu.: 1.092   1st Qu.:  0.04974   1st Qu.:  -0.04479  
##  Median : 3.390   Median : 2.805   Median :  0.17095   Median :   0.00127  
##  Mean   : 8.731   Mean   : 7.190   Mean   :  6.02820   Mean   :   0.34473  
##  3rd Qu.: 9.279   3rd Qu.: 9.119   3rd Qu.:  1.14709   3rd Qu.:   0.07173  
##  Max.   :92.960   Max.   :74.846   Max.   :175.44412   Max.   :  57.59695  
##  NA's   :398      NA's   :208      NA's   :9           NA's   :53          
##  deport_convict_lead  deport_nonconvict deport_nonconvict_d   corruption   
##  Min.   :-164.00466   Min.   :  0.000   Min.   :-117.9508   Min.   :0.000  
##  1st Qu.:  -0.05009   1st Qu.:  0.120   1st Qu.:  -0.1497   1st Qu.:2.000  
##  Median :   0.00000   Median :  0.392   Median :   0.0000   Median :2.500  
##  Mean   :   0.19557   Mean   :  7.350   Mean   :   0.9367   Mean   :2.981  
##  3rd Qu.:   0.07078   3rd Qu.:  2.470   3rd Qu.:   0.1312   3rd Qu.:4.000  
##  Max.   :  57.59695   Max.   :307.606   Max.   : 196.3743   Max.   :6.000  
##  NA's   :132          NA's   :143       NA's   :155         NA's   :204    
##  crime_US_weight   GDP_growth      GDPpercapita_const2010USD_log
##  Min.   :341.5   Min.   :-16.700   Min.   : 5.755               
##  1st Qu.:407.8   1st Qu.:  1.730   1st Qu.: 8.082               
##  Median :466.6   Median :  3.852   Median : 8.915               
##  Mean   :464.1   Mean   :  3.900   Mean   : 8.949               
##  3rd Qu.:506.0   3rd Qu.:  5.970   3rd Qu.:10.056               
##  Max.   :724.9   Max.   : 34.500   Max.   :11.485               
##  NA's   :234     NA's   :12        NA's   :16                   
##     gini_mkt     instr_benef_medicaidpregn_lag1 instr_enforce_everify_lag1
##  Min.   :27.90   Min.   :0.1484                 Min.   :0.00000           
##  1st Qu.:43.00   1st Qu.:0.6028                 1st Qu.:0.00000           
##  Median :46.90   Median :0.7047                 Median :0.03464           
##  Mean   :46.53   Mean   :0.6826                 Mean   :0.07168           
##  3rd Qu.:50.10   3rd Qu.:0.7784                 3rd Qu.:0.09822           
##  Max.   :68.50   Max.   :0.9633                 Max.   :0.79440           
##  NA's   :132     NA's   :313                    NA's   :313               
##   ...19            polity2          pop_sh14     population_log  
##  Mode:logical   Min.   :-9.000   Min.   :11.06   Min.   : 9.678  
##  NA's:1012      1st Qu.: 5.000   1st Qu.:17.57   1st Qu.:15.036  
##                 Median : 8.000   Median :24.05   Median :15.958  
##                 Mean   : 5.822   Mean   :25.39   Mean   :15.819  
##                 3rd Qu.:10.000   3rd Qu.:31.72   3rd Qu.:17.229  
##                 Max.   :10.000   Max.   :49.87   Max.   :21.034  
##                 NA's   :153      NA's   :37                      
##   ...23         remittances_GDP     urban_interp    war_intrastate    
##  Mode:logical   Min.   : 0.00139   Min.   : 10.30   Min.   :0.000000  
##  NA's:1012      1st Qu.: 0.29699   1st Qu.: 49.17   1st Qu.:0.000000  
##                 Median : 1.39941   Median : 64.65   Median :0.000000  
##                 Mean   : 3.95100   Mean   : 62.13   Mean   :0.006917  
##                 3rd Qu.: 4.83025   3rd Qu.: 77.05   3rd Qu.:0.000000  
##                 Max.   :49.28990   Max.   :100.00   Max.   :1.000000  
##                 NA's   :86                                            
##      yr_sch      
##  Min.   : 2.870  
##  1st Qu.: 7.432  
##  Median : 9.399  
##  Mean   : 9.029  
##  3rd Qu.:10.810  
##  Max.   :16.815  
##  NA's   :183

Remove the two all-NA columns, then drop rows with missing values in the key columns.

# Drop the two auto-named columns (`...19`, `...23`) that contain only NA.
df <- data_replic %>%
  select(-all_of(c("...19", "...23")))

# Keep only rows where every key analysis column is observed.
newdf <- df %>%
  filter(
    !is.na(deport_nonconvict),
    !is.na(homrates_unodc),
    !is.na(region),
    !is.na(deport_convict),
    !is.na(income_group)
  )
newdf
## # A tibble: 537 × 25
##    iso3   year region    income_group homrates_unodc homrates_who deport_convict
##    <chr> <dbl> <chr>     <chr>                 <dbl>        <dbl>          <dbl>
##  1 AFG    2010 South As… Low income             3.81        NA            0.0194
##  2 ALB    2006 Europe &… Upper middl…           4.06         0            1.07  
##  3 ALB    2008 Europe &… Upper middl…           3.31         2.87         0.964 
##  4 ALB    2010 Europe &… Upper middl…           3.64         1.15         1.59  
##  5 ALB    2012 Europe &… Upper middl…           5.18        NA            1.27  
##  6 ALB    2014 Europe &… Upper middl…           4.18        NA            0.916 
##  7 ARM    2006 Europe &… Lower middl…           2.28        NA            0.993 
##  8 ARM    2008 Europe &… Lower middl…           2.92        NA            1.32  
##  9 ARM    2010 Europe &… Lower middl…           2.58         1.70         1.04  
## 10 ARM    2012 Europe &… Lower middl…           2.28         1.61         1.18  
## # ℹ 527 more rows
## # ℹ 18 more variables: deport_convict_d <dbl>, deport_convict_lead <dbl>,
## #   deport_nonconvict <dbl>, deport_nonconvict_d <dbl>, corruption <dbl>,
## #   crime_US_weight <dbl>, GDP_growth <dbl>,
## #   GDPpercapita_const2010USD_log <dbl>, gini_mkt <dbl>,
## #   instr_benef_medicaidpregn_lag1 <dbl>, instr_enforce_everify_lag1 <dbl>,
## #   polity2 <dbl>, pop_sh14 <dbl>, population_log <dbl>, …

# Make a correlation plot

# Install corrplot only when it is not already available, so re-running or
# knitting this document does not reinstall the package every time.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
## The following package(s) will be installed:
## - corrplot [0.92]
## These packages will be installed into "~/Desktop/302W/302W Project/data/data and replication code/renv/library/R-4.2/x86_64-apple-darwin17.0".
## 
## # Installing packages --------------------------------------------------------
## - Installing corrplot ...                       OK [linked from cache]
## Successfully installed 1 package in 11 milliseconds.
library(corrplot)
## corrplot 0.92 loaded
# Encode income_group as a numeric rank, from 1 ("Low income") up to
# 5 ("High income: OECD"), so it can be used in the correlation matrix.
# Labels not in the list (including NA) become NA.
df <- df %>%
  mutate(
    income_group_numeric = as.numeric(match(
      income_group,
      c(
        "Low income",
        "Lower middle income",
        "Upper middle income",
        "High income: nonOECD",
        "High income: OECD"
      )
    ))
  )

General overview of correlations between different variables

# Keep the variables of interest and only the rows where all of them are
# observed, since cor() is run on complete cases.
cor_data <- na.omit(
  select(
    df,
    corruption, homrates_unodc, GDPpercapita_const2010USD_log, urban_interp,
    year, deport_convict, deport_nonconvict, polity2, yr_sch,
    income_group_numeric
  )
)

# Correlation matrix of every selected variable against every other
cor_matrix <- cor(cor_data)

# Upper-triangle heat map, variables ordered by hierarchical clustering
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black", # print the coefficient inside each cell
  tl.col = "black",      # label colour
  tl.srt = 45,           # label rotation
  diag = FALSE,          # hide the diagonal
  number.cex = 0.8,      # coefficient text size
  tl.cex = 0.8           # label text size
)

The correlation plot shows that homicide rates increase as deportation rates increase. However, over the years available in the data, neither homicide rates nor deportation rates increased significantly.

# Income distribution counts in dataset for each region

# Bar plot: number of observations per region, with one side-by-side bar per
# income group.
ggplot(data = df, mapping = aes(x = region, fill = income_group)) +
  geom_bar(position = position_dodge()) +
  ggtitle("Income Distribution Counts for Each Region") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

More data on income group is available for Latin America & Caribbean and Europe & Central Asia than for other regions.

# Convict deportation distribution in dataset by region

# Bar plot: number of observations per region that have a recorded convict
# deportation rate. The rate is continuous, so it cannot be used as a bar
# fill; data availability is shown by counting only the non-missing rows.
df %>%
  filter(!is.na(deport_convict)) %>%
  ggplot(aes(x = region)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Convict Deportation Rate Distribution Counts per Region",
    y = "Observations with a recorded rate"
  )

More data on convict deportation is available for Latin America & Caribbean and Europe & Central Asia than for other regions.

# Non-convict deportation distribution in dataset by region

# Bar plot: number of observations per region that have a recorded
# non-convict deportation rate. The rate is continuous, so it cannot be used
# as a bar fill; data availability is shown by counting only the non-missing
# rows.
df %>%
  filter(!is.na(deport_nonconvict)) %>%
  ggplot(aes(x = region)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Non-Convict Deportation Rate Distribution Counts per Region",
    y = "Observations with a recorded rate"
  )

More data on deportation of non-convicts is available for Latin America & Caribbean and Europe & Central Asia than for other regions.

# Corruption distribution in dataset by region

# Bar plot: number of observations per region that have a recorded corruption
# score. The score is numeric, so it cannot be used as a bar fill; data
# availability is shown by counting only the non-missing rows.
df %>%
  filter(!is.na(corruption)) %>%
  ggplot(aes(x = region)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Corruption Score Distribution Counts per Region",
    y = "Observations with a recorded score"
  )

More data on corruption is available for Latin America & Caribbean and Europe & Central Asia than for other regions.

# Boxplot of corruption scores for all the regions

# Work with just the region label and the corruption score
corruption_data <- select(data_replic, region, corruption)

# Mean, median, minimum and maximum corruption score per region, ignoring
# missing scores
summary_stats <- corruption_data %>%
  group_by(region) %>%
  summarise(
    across(
      corruption,
      list(
        mean = ~ mean(.x, na.rm = TRUE),
        median = ~ median(.x, na.rm = TRUE),
        min = ~ min(.x, na.rm = TRUE),
        max = ~ max(.x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )

# Box plot of corruption scores, one box (and fill colour) per region
boxplot_corruption <- ggplot(
  data = corruption_data,
  mapping = aes(x = region, y = corruption, fill = region)
) +
  geom_boxplot() +
  ggtitle("Corruption Scores by Region") +
  xlab("Region") +
  ylab("Corruption Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # slanted labels stay legible

# Show the per-region summary table
print(summary_stats)
## # A tibble: 8 × 5
##   region         mean_corruption median_corruption min_corruption max_corruption
##   <chr>                    <dbl>             <dbl>          <dbl>          <dbl>
## 1 East Asia & P…            3.24               3              1              5.5
## 2 Europe & Cent…            3.5                3              1              6  
## 3 Latin America…            2.49               2.5            1              5  
## 4 Middle East &…            2.39               2              1              5  
## 5 North America             4.88               5              3.5            6  
## 6 South Asia                2.48               2.5            1              4  
## 7 Sub-Saharan A…            1.93               2              0              3  
## 8 <NA>                      2.5                2.5            2              3
# Render the box plot; rows with a missing corruption score are dropped by
# ggplot with a warning.
print(boxplot_corruption)
## Warning: Removed 204 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Latin America & Caribbean has more corruption on average than Europe & Central Asia.

# Income distribution proportions for each region, using only rows with no missing deportation or homicide values

# Count observations for each region / income-group combination
newdf_with_count <- newdf %>%
  group_by(region, income_group) %>%
  summarise(count = n(), .groups = "drop")

# Total number of observations per region
totals <- newdf_with_count %>%
  group_by(region) %>%
  summarise(total = sum(count), .groups = "drop")

# Attach each region's total to its income-group counts and convert the counts
# into within-region proportions
newdf_with_proportions <- newdf_with_count %>%
  left_join(totals, by = "region") %>%
  mutate(proportion = count / total)

# Side-by-side bars of the within-region share of each income group
ggplot(
  data = newdf_with_proportions,
  mapping = aes(x = region, y = proportion, fill = income_group)
) +
  geom_col(position = position_dodge()) +
  ylab("Proportion") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Europe & Central Asia has a higher proportion of High income whereas Latin America & Caribbean has a higher proportion of Upper middle income.

# Income distribution proportions for each region, with darker colors corresponding to lower income

# Turn income_group into an ordered factor running from the lowest to the
# highest income level, so the legend and bars follow that order.
newdf_with_proportions <- newdf_with_proportions %>%
  mutate(
    income_group = factor(
      income_group,
      levels = c(
        "Low income",
        "Lower middle income",
        "Upper middle income",
        "High income: nonOECD",
        "High income: OECD"
      ),
      ordered = TRUE
    )
  )

# The reversed "Blues" palette gives the darkest shade to the lowest income
# level. preserve = "single" keeps bar widths equal when a region lacks some
# income groups.
ggplot(
  data = newdf_with_proportions,
  mapping = aes(x = region, y = proportion, fill = income_group)
) +
  geom_col(position = position_dodge(preserve = "single")) +
  scale_fill_brewer(palette = "Blues", direction = -1) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(y = "Proportion", fill = "Income Group")

# Correlational plot with Convict Deportation Rates and Homicide Rates by Region (Scaled and Unscaled)

# Convict deportation rate vs. homicide rate with a linear fit, one panel per
# region. Each panel gets its own axis ranges (scales = "free") so regions
# with small values remain readable.
ggplot(newdf, aes(x = deport_convict, y = homrates_unodc)) +
  geom_point(aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~ region, scales = "free") +
  labs(title = "Correlation Between Convict Deportation Rates and Homicide Rates by Region",
       x = "Convict Deportation Rate",
       y = "UNODC Homicide Rate")
## `geom_smooth()` using formula = 'y ~ x'

# Convict deportation rate vs. homicide rate with a linear fit, one panel per
# region, all panels sharing the same axis ranges so regions can be compared
# directly.
ggplot(newdf, aes(x = deport_convict, y = homrates_unodc)) +
  geom_point(aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~ region) +
  labs(title = "Correlation Between Convict Deportation Rates and Homicide Rates by Region",
       x = "Convict Deportation Rate", # x is deport_convict, so label it as such
       y = "UNODC Homicide Rate")
## `geom_smooth()` using formula = 'y ~ x'

# Correlation between convict deportation and homicide rates, computed
# separately within each region on complete observations.
correlation_by_region <- summarize(
  group_by(newdf, region),
  correlation = cor(deport_convict, homrates_unodc, use = "complete.obs")
)

print(correlation_by_region)
## # A tibble: 7 × 2
##   region                     correlation
##   <chr>                            <dbl>
## 1 East Asia & Pacific             0.101 
## 2 Europe & Central Asia           0.0529
## 3 Latin America & Caribbean       0.616 
## 4 Middle East & North Africa     -0.0952
## 5 North America                   0.996 
## 6 South Asia                      0.493 
## 7 Sub-Saharan Africa             -0.0725

# Correlational plot with Non-Convict Deportation Rates and Homicide Rates by Region (Scaled and Unscaled)

# Non-convict deportation rate vs. homicide rate with a linear fit, one panel
# per region. Each panel gets its own axis ranges (scales = "free") so regions
# with small values remain readable.
ggplot(newdf, aes(x = deport_nonconvict, y = homrates_unodc)) +
  geom_point(aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~ region, scales = "free") +
  labs(title = "Correlation Between Non-Convict Deportation Rates and Homicide Rates by Region",
       x = "Non-Convict Deportation Rate",
       y = "UNODC Homicide Rate")
## `geom_smooth()` using formula = 'y ~ x'

# Non-convict deportation rate vs. homicide rate with a linear fit, one panel
# per region, all panels sharing the same axis ranges.
ggplot(data = newdf, mapping = aes(deport_nonconvict, homrates_unodc)) +
  geom_point(mapping = aes(colour = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(region)) +
  ggtitle("Correlation Between Non-Convict Deportation Rates and Homicide Rates by Region") +
  xlab("Non-Convict Deportation Rate") +
  ylab("UNODC Homicide Rate")
## `geom_smooth()` using formula = 'y ~ x'

# Correlation between non-convict deportation and homicide rates, computed
# separately within each region on complete observations.
correlation_by_region2 <- summarize(
  group_by(newdf, region),
  correlation = cor(deport_nonconvict, homrates_unodc, use = "complete.obs")
)

print(correlation_by_region2)
## # A tibble: 7 × 2
##   region                     correlation
##   <chr>                            <dbl>
## 1 East Asia & Pacific              0.158
## 2 Europe & Central Asia            0.132
## 3 Latin America & Caribbean        0.576
## 4 Middle East & North Africa      -0.142
## 5 North America                    0.890
## 6 South Asia                       0.364
## 7 Sub-Saharan Africa              -0.178

These correlational plots show that the most comprehensive data for both non-convicts and convicts is for Latin America and the Caribbean. For this reason, we will focus on this region.

# Distribution of Years in the Dataset

# Store year as an integer so that, with binwidth = 1, each histogram bar
# corresponds to one calendar year.
newdf <- mutate(newdf, year = as.integer(year))

# Histogram of the number of observations per year. No fill aesthetic is
# mapped, so no fill scale is needed.
ggplot(newdf, aes(x = year)) +
  geom_histogram(alpha = 0.5, binwidth = 1) +
  labs(title = "Distribution of Years", x = "Year", y = "Count") +
  theme_minimal()

Data only available from 2004 to 2014. This plus the correlation plot shows a limitation that there hasn’t been much change in the variables which may make it more difficult to find relationships in this time.

# Filter the dataset for the Latin America & Caribbean region
latin_america_data <- newdf %>%
  filter(region == "Latin America & Caribbean")

# Several countries report in the same year, so collapse the region to one
# mean homicide rate per year. Aggregating explicitly gives the same series
# that approx() produced by averaging duplicated years, without the
# "collapsing to unique 'x' values" warning. `region` is kept as a grouping
# column because the combined plot colours by it.
imputed_data1 <- latin_america_data %>%
  group_by(region, year) %>%
  summarise(homrates_unodc = mean(homrates_unodc, na.rm = TRUE),
            .groups = "drop")

# Line plot of the regional mean homicide rate over time
ggplot(imputed_data1, aes(x = year, y = homrates_unodc)) +
  geom_line() +
  labs(title = "Homicide Rates in Latin America & Caribbean",
       x = "Year",
       y = "Homicide Rates (per 100,000 population)") +
  theme_minimal()

# Filter the dataset for the Europe & Central Asia region
europe_central_asia_data <- newdf %>%
  filter(region == "Europe & Central Asia")

# One mean homicide rate per year for Europe & Central Asia
imputed_data2 <- europe_central_asia_data %>%
  group_by(region, year) %>%
  summarise(homrates_unodc = mean(homrates_unodc, na.rm = TRUE),
            .groups = "drop")

# Line plot of the regional mean homicide rate over time
ggplot(imputed_data2, aes(x = year, y = homrates_unodc)) +
  geom_line() +
  labs(title = "Homicide Rates in Europe and Central Asia",
       x = "Year",
       y = "Homicide Rates (per 100,000 population)") +
  theme_minimal()

# Stack the two regional series and draw them on shared axes. Colours are
# named so each region keeps its colour regardless of level order.
combined_data <- bind_rows(imputed_data1, imputed_data2)
ggplot(combined_data, aes(x = year, y = homrates_unodc, color = region)) +
  geom_line() +
  labs(title = "Homicide Rates in Latin America & Caribbean vs. Europe & Central Asia",
       x = "Year",
       y = "Homicide Rates (per 100,000 population)") +
  theme_minimal() +
  scale_color_manual(values = c("Europe & Central Asia" = "blue",
                                "Latin America & Caribbean" = "red"))

Homicide rates in Europe and Central Asia are low and decrease slightly over the years, while homicide rates in Latin America and the Caribbean are much higher and increase, then decrease.

# Make new dataset with just Latin America and the Caribbean

# Filter the dataset for the Latin America & Caribbean region
latin_america_data <- newdf %>%
  filter(region == "Latin America & Caribbean")

# Several countries report in the same year, so collapse the region to one
# mean per year for each rate. This gives the same series that approx()
# produced by averaging duplicated years, without the "collapsing to unique
# 'x' values" warnings.
imputed_data <- latin_america_data %>%
  group_by(year) %>%
  summarise(
    across(c(homrates_unodc, deport_convict, deport_nonconvict),
           ~ mean(.x, na.rm = TRUE)),
    .groups = "drop"
  )

# Line plot of the three regional mean series. Line thickness is set with
# `linewidth` because `size` is deprecated for lines since ggplot2 3.4.0.
ggplot(imputed_data, aes(x = year, y = homrates_unodc)) +
  geom_line(aes(color = "Homicide Rates"), linewidth = 1) +
  geom_line(aes(y = deport_convict, color = "Convict Deportation Rates"), linetype = "dashed", linewidth = 1) +
  geom_line(aes(y = deport_nonconvict, color = "Non-Convict Deportation Rates"), linetype = "dotdash", linewidth = 1) +
  labs(title = "Homicide and Deportation Rates in Latin America & Caribbean",
       x = "Year",
       y = "Rates per 100,000 population",
       color = "Legend") +
  scale_color_manual(values = c("Homicide Rates" = "red",
                                "Convict Deportation Rates" = "blue",
                                "Non-Convict Deportation Rates" = "green")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Filter data for Europe & Central Asia
europe_data <- newdf %>%
  filter(region == "Europe & Central Asia")

# Several countries report in the same year, so collapse the region to one
# mean per year for each rate. This gives the same series that approx()
# produced by averaging duplicated years, without the "collapsing to unique
# 'x' values" warnings.
imputed_europe_data <- europe_data %>%
  group_by(year) %>%
  summarise(
    across(c(homrates_unodc, deport_convict, deport_nonconvict),
           ~ mean(.x, na.rm = TRUE)),
    .groups = "drop"
  )

# Create the plot for Europe & Central Asia. Line thickness is set with
# `linewidth` because `size` is deprecated for lines since ggplot2 3.4.0.
plot_europe <- ggplot(imputed_europe_data) +
  geom_line(aes(x = year, y = homrates_unodc, color = "Homicide Rates"), linewidth = 1) +
  geom_line(aes(x = year, y = deport_convict, color = "Convict Deportation Rates"), linewidth = 1, linetype = "dashed") +
  geom_line(aes(x = year, y = deport_nonconvict, color = "Non-Convict Deportation Rates"), linewidth = 1, linetype = "dotdash") +
  labs(title = "Homicide and Deportation Rates in Europe & Central Asia",
       x = "Year",
       y = "Rates per 100,000 population",
       color = "Legend") +
  scale_color_manual(values = c("Homicide Rates" = "red",
                                "Convict Deportation Rates" = "blue",
                                "Non-Convict Deportation Rates" = "green")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Display the plot for Europe & Central Asia
plot_europe

# All Latin America & Caribbean rows, including those with missing rates
latin_america_tot <- df %>%
  filter(region == "Latin America & Caribbean")

# Collapse to one regional mean per year for each deportation rate, then
# linearly interpolate any year whose mean is missing because no country
# reported. Aggregating before calling approx() means each year appears only
# once, which avoids the "collapsing to unique 'x' values" warnings.
imputed_latin_america_tot <- latin_america_tot %>%
  group_by(year) %>%
  summarise(deport_convict = mean(deport_convict, na.rm = TRUE),
            deport_nonconvict = mean(deport_nonconvict, na.rm = TRUE),
            .groups = "drop") %>%
  mutate(
    deport_convict = approx(year[is.finite(deport_convict)],
                            deport_convict[is.finite(deport_convict)],
                            xout = year)$y,
    deport_nonconvict = approx(year[is.finite(deport_nonconvict)],
                               deport_nonconvict[is.finite(deport_nonconvict)],
                               xout = year)$y
  )

# Line plot of both deportation series. `linewidth` replaces `size`, which is
# deprecated for lines since ggplot2 3.4.0.
ggplot(imputed_latin_america_tot) +
  geom_line(aes(x = year, y = deport_convict, color = "Convict Deportation Rates"), linewidth = 1) +
  geom_line(aes(x = year, y = deport_nonconvict, color = "Non-Convict Deportation Rates"), linewidth = 1) +
  labs(title = "Deportation Rates in Latin America & Caribbean",
       x = "Year",
       y = "Rates per 100,000 population",
       color = "Legend") +
  scale_color_manual(values = c("Convict Deportation Rates" = "blue",
                                "Non-Convict Deportation Rates" = "green")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# All Europe & Central Asia rows, including those with missing rates
europe_tot <- df %>%
  filter(region == "Europe & Central Asia")

# Collapse to one regional mean per year for each deportation rate, then
# linearly interpolate any year whose mean is missing because no country
# reported. Aggregating before calling approx() means each year appears only
# once, which avoids the "collapsing to unique 'x' values" warnings.
imputed_europe_tot <- europe_tot %>%
  group_by(year) %>%
  summarise(deport_convict = mean(deport_convict, na.rm = TRUE),
            deport_nonconvict = mean(deport_nonconvict, na.rm = TRUE),
            .groups = "drop") %>%
  mutate(
    deport_convict = approx(year[is.finite(deport_convict)],
                            deport_convict[is.finite(deport_convict)],
                            xout = year)$y,
    deport_nonconvict = approx(year[is.finite(deport_nonconvict)],
                               deport_nonconvict[is.finite(deport_nonconvict)],
                               xout = year)$y
  )

# Line plot of both deportation series. The title names the region that was
# actually filtered above. `linewidth` replaces `size`, which is deprecated
# for lines since ggplot2 3.4.0.
ggplot(imputed_europe_tot) +
  geom_line(aes(x = year, y = deport_convict, color = "Convict Deportation Rates"), linewidth = 1) +
  geom_line(aes(x = year, y = deport_nonconvict, color = "Non-Convict Deportation Rates"), linewidth = 1) +
  labs(title = "Deportation Rates in Europe & Central Asia",
       x = "Year",
       y = "Rates per 100,000 population",
       color = "Legend") +
  scale_color_manual(values = c("Convict Deportation Rates" = "blue",
                                "Non-Convict Deportation Rates" = "green")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Keep the three columns needed for the scatter plot, restricted to the two
# regions being compared.
region_data <- data_replic %>%
  select(region, corruption, homrates_unodc) %>%
  filter(
    region == "Latin America & Caribbean" | region == "Europe & Central Asia"
  )

# Corruption score (x) against homicide rate (y), coloured by region
scatter_plot <- ggplot(
  data = region_data,
  mapping = aes(x = corruption, y = homrates_unodc, colour = region)
) +
  geom_point() +
  ggtitle("Corruption Scores vs. Homicide Rates") +
  xlab("Corruption Score") +
  ylab("Homicide Rate") +
  labs(colour = "Region") +
  theme_minimal()

# Render the scatter plot
print(scatter_plot)
## Warning: Removed 370 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Corruption score vs. convict deportation rate with a linear fit, one panel
# per region. Each panel gets its own axis ranges (scales = "free").
ggplot(newdf, aes(x = corruption, y = deport_convict)) +
  geom_point(aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~ region, scales = "free") +
  labs(title = "Correlation Between Corruption Scores and Convict Deportation Rates by Region",
       x = "Corruption Score",
       y = "Convict Deportation Rate")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 76 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 76 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Corruption score vs. convict deportation rate with a linear fit, one panel
# per region, all panels sharing the same axis ranges.
ggplot(data = newdf, mapping = aes(corruption, deport_convict)) +
  geom_point(mapping = aes(colour = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(region)) +
  ggtitle("Correlation Between Corruption Scores and Convict Deportation Rates by Region") +
  xlab("Corruption Score") +
  ylab("Convict Deportation Rate")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 76 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 76 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Corruption score vs. non-convict deportation rate with a linear fit, one
# panel per region. Each panel gets its own axis ranges (scales = "free").
ggplot(newdf, aes(x = corruption, y = deport_nonconvict)) +
  geom_point(aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~ region, scales = "free") +
  labs(title = "Corruption Scores & Non-Convict Deportation Rates by Region",
       x = "Corruption Score",
       y = "Non-Convict Deportation Rate")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 76 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 76 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Corruption score vs. non-convict deportation rate with a linear fit, one
# panel per region, all panels sharing the same axis ranges.
ggplot(data = newdf, mapping = aes(corruption, deport_nonconvict)) +
  geom_point(mapping = aes(colour = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(region)) +
  ggtitle("Corruption Scores & Non-Convict Deportation Rates by Region") +
  xlab("Corruption Score") +
  ylab("Non-Convict Deportation Rate")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 76 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 76 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Mean homicide and deportation rates per region, ignoring missing values
mean_rates <- newdf %>%
  group_by(region) %>%
  summarize(mean_homicide_rate = mean(homrates_unodc, na.rm = TRUE),
            mean_deport_convict = mean(deport_convict, na.rm = TRUE),
            mean_deport_nonconvict = mean(deport_nonconvict, na.rm = TRUE))

# One bar per region. geom_col() plots the precomputed means directly and is
# equivalent to geom_bar(stat = "identity").
ggplot(mean_rates, aes(x = region)) +
  geom_col(aes(y = mean_homicide_rate), fill = "blue") +
  labs(title = "Mean Homicide Rates by Region (2004-2014)",
       x = "Region",
       y = "Rate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(mean_rates, aes(x = region)) +
  geom_col(aes(y = mean_deport_convict), fill = "red") +
  labs(title = "Mean Convict Deportation Rates by Region (2004-2014)",
       x = "Region",
       y = "Rate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Title names the plotted quantity in full (non-convict deportation rates)
ggplot(mean_rates, aes(x = region)) +
  geom_col(aes(y = mean_deport_nonconvict), fill = "green") +
  labs(title = "Mean Non-Convict Deportation Rates by Region (2004-2014)",
       x = "Region",
       y = "Rate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))